import os
import sys
import urllib.request
from typing import List, Dict, Set

# -----------------------------------------------------------------------------
# 1. REPLICATION OF THE C LOGIC
# This section mirrors the companion C implementation for compatibility.
# -----------------------------------------------------------------------------

def crc32(data: List[int]) -> int:
    """
    Compute a 32-bit CRC over a sequence of 32-bit words (codepoints).

    Each word is consumed most-significant bit first, using polynomial
    0x04C11DB7 and an initial register of 0xFFFFFFFF. No final XOR or bit
    reflection is applied. Only the low 32 bits of each word take part.

    Args:
        data: The words to hash.

    Returns:
        The CRC as an unsigned 32-bit value, or 0 when ``data`` is empty.
    """
    if not data:
        return 0

    mask = 0xFFFFFFFF
    polynomial = 0x04C11DB7
    register = mask

    for word in data:
        # Walk the word from bit 31 down to bit 0.
        for shift in range(31, -1, -1):
            incoming = (word >> shift) & 1
            # The register is always kept within 32 bits, so this is its MSB.
            outgoing = register >> 31
            register = (register << 1) & mask
            if outgoing != incoming:
                register ^= polynomial

    return register

def is_base_emoji(cp: int) -> bool:
    """
    Report whether a codepoint counts as a base emoji character.

    Everything is treated as base except the skin-tone modifiers
    (U+1F3FB..U+1F3FF) and the zero width joiner (U+200D). Gender signs
    (U+2640, U+2642) and VS16 (U+FE0F) are therefore base characters and
    can end up in a component list.
    """
    is_skin_tone = 0x1F3FB <= cp <= 0x1F3FF
    is_zero_width_joiner = cp == 0x200D
    return not (is_skin_tone or is_zero_width_joiner)

def parse_codepoints_to_components(codepoints: List[int]) -> Dict:
    """
    Split a codepoint sequence into a primary codepoint, components and tones.

    Skin-tone modifiers (U+1F3FB..U+1F3FF) are recorded as values 1..5 in
    'skin_tone1' and then 'skin_tone2'; any further modifiers are ignored.
    Of the remaining codepoints accepted by ``is_base_emoji``, the first
    becomes 'primary_cp' and later ones are appended to 'component_list',
    which holds at most 16 entries. 'hash' is ``crc32`` of the component
    list, or 0 when the list is empty. 'flags' is always 0.

    Args:
        codepoints: The emoji sequence as integer codepoints.

    Returns:
        A dict with keys 'primary_cp', 'component_list', 'skin_tone1',
        'skin_tone2', 'flags' and 'hash'.
    """
    first_tone, last_tone = 0x1F3FB, 0x1F3FF
    max_components = 16

    primary = 0
    extras: List[int] = []
    tones: List[int] = []

    for cp in codepoints:
        if first_tone <= cp <= last_tone:
            # Only the first two modifiers are kept.
            if len(tones) < 2:
                tones.append(cp - first_tone + 1)
            continue
        if not is_base_emoji(cp):
            continue
        if primary == 0:
            primary = cp
        elif len(extras) < max_components:
            extras.append(cp)

    # Unused tone slots are reported as 0.
    tones.extend([0] * (2 - len(tones)))

    return {
        'primary_cp': primary,
        'component_list': extras,
        'skin_tone1': tones[0],
        'skin_tone2': tones[1],
        'flags': 0,
        'hash': crc32(extras) if extras else 0,
    }

# -----------------------------------------------------------------------------
# 2. EMOJI DATA PROCESSING AND FILE GENERATION
# -----------------------------------------------------------------------------

# Emoji data release to fetch, and the directory it is published under.
UNICODE_VERSION = "15.1"
BASE_URL = "https://unicode.org/Public/emoji/" + UNICODE_VERSION + "/"

# Data files fetched from BASE_URL and parsed, in this order.
FILE_LIST = [
    "emoji-sequences.txt",
    "emoji-test.txt",
    "emoji-zwj-sequences.txt",
]

# Path of the generated C header.
OUTPUT_HEADER = "binmoji_table.h"

def download_file(url: str, filename: str) -> None:
    """Download ``url`` to ``filename`` unless that file already exists.

    The payload is first written to ``filename + ".part"`` and renamed to
    ``filename`` only after the transfer has finished. A failed or
    interrupted transfer therefore never leaves a truncated ``filename``
    behind that a later run would mistake for a complete download.

    Args:
        url: Location to fetch.
        filename: Local path the content is stored at.

    Raises:
        SystemExit: With status 1 when the download fails; the error is
            reported on stderr first.
    """
    if os.path.exists(filename):
        return

    sys.stderr.write(f"Downloading {filename} from Unicode Consortium...\n")
    partial = filename + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Atomic rename: ``filename`` appears only once it is complete.
        os.replace(partial, filename)
    except Exception as e:
        # Script-level boundary: any failure is reported and ends the run.
        try:
            os.remove(partial)
        except OSError:
            # Nothing was written, or cleanup itself failed; exit regardless.
            pass
        sys.stderr.write(f"Error downloading {url}: {e}\n")
        sys.exit(1)

def generate_c_header(hashes: Dict[int, List[int]]):
    """Write the hash table to ``OUTPUT_HEADER`` as a C array definition.

    Each entry of ``hashes`` becomes one initializer of the form
    ``{hash, component count, {components...}}``, emitted in ascending hash
    order so the output is stable between runs. Progress is reported on
    stderr.

    Args:
        hashes: Mapping from a component-list hash to that component list.
    """
    sys.stderr.write(f"Generating C header file: {OUTPUT_HEADER}\n")

    preamble = (
        "#ifndef EMOJI_HASH_TABLE_H\n",
        "#define EMOJI_HASH_TABLE_H\n\n",
        "#include <stdint.h>\n\n",
        "/* This file is auto-generated by generate_hash_table.py */\n\n",
        "static const EmojiHashEntry binmoji_table[] = {\n",
    )
    epilogue = (
        "};\n\n",
        "#endif /* EMOJI_HASH_TABLE_H */\n",
    )

    with open(OUTPUT_HEADER, 'w', encoding='utf-8') as header:
        header.writelines(preamble)

        # Keys are unique, so ordering by key alone fixes the entry order.
        for key in sorted(hashes):
            members = hashes[key]
            initializer = ", ".join(map("0x{:X}".format, members))
            header.write(f"    {{0x{key:08X}, {len(members)}, {{{initializer}}}}},\n")

        header.writelines(epilogue)

    sys.stderr.write("Header file generated successfully.\n")

def main():
    """Fetch the emoji data files, hash their sequences and emit the header.

    Every file in ``FILE_LIST`` is downloaded if missing, then read line by
    line. Comment and blank lines are skipped, as are lines whose first
    ';'-separated field is not a whitespace-separated list of hexadecimal
    codepoints. Each distinct sequence is parsed once; sequences with a
    non-zero component hash are collected and passed to
    ``generate_c_header``. When two different component lists share a hash,
    a collision is reported on stderr and the later list replaces the
    earlier one.
    """
    for name in FILE_LIST:
        download_file(BASE_URL + name, name)

    table: Dict[int, List[int]] = {}
    seen: Set[tuple] = set()

    for name in FILE_LIST:
        sys.stderr.write(f"\n--- Processing {name} ---\n")
        with open(name, 'r', encoding='utf-8') as source:
            for raw in source:
                if raw.startswith('#') or not raw.strip():
                    continue

                field = raw.partition(';')[0].strip()
                try:
                    sequence = [int(token, 16) for token in field.split()]
                except ValueError:
                    # Not a plain codepoint list (e.g. a range); skip it.
                    continue

                key = tuple(sequence)
                if key in seen:
                    continue
                seen.add(key)

                parsed = parse_codepoints_to_components(sequence)
                digest = parsed['hash']
                if digest == 0:
                    continue

                members = parsed['component_list']
                previous = table.get(digest)
                if previous is not None and previous != members:
                    sys.stderr.write(f"Collision detected! Hash 0x{digest:08X} is shared.\n")
                table[digest] = members

    sys.stderr.write(f"\nFound {len(table)} unique component sequences with hashes.\n")
    generate_c_header(table)

# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
